New York City Flights Data Playground
Dataset
flights
arrange(flights, year, desc(month), day)
Take a simple summary of this dataset:
str(flights)
## Classes 'tbl_df', 'tbl' and 'data.frame': 336776 obs. of 19 variables:
## $ year : int 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int 517 533 542 544 554 554 555 557 557 558 ...
## $ sched_dep_time: int 515 529 540 545 600 558 600 600 600 600 ...
## $ dep_delay : num 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
## $ arr_time : int 830 850 923 1004 812 740 913 709 838 753 ...
## $ sched_arr_time: int 819 830 850 1022 837 728 854 723 846 745 ...
## $ arr_delay : num 11 20 33 -18 -25 12 19 -14 -8 8 ...
## $ carrier : chr "UA" "UA" "AA" "B6" ...
## $ flight : int 1545 1714 1141 725 461 1696 507 5708 79 301 ...
## $ tailnum : chr "N14228" "N24211" "N619AA" "N804JB" ...
## $ origin : chr "EWR" "LGA" "JFK" "JFK" ...
## $ dest : chr "IAH" "IAH" "MIA" "BQN" ...
## $ air_time : num 227 227 160 183 116 150 158 53 140 138 ...
## $ distance : num 1400 1416 1089 1576 762 ...
## $ hour : num 5 5 5 5 6 5 6 6 6 6 ...
## $ minute : num 15 29 40 45 0 58 0 0 0 0 ...
## $ time_hour : POSIXct, format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
Common Variable Types
Common Variable Types in DataFrame:
intstands for integersdblstands for doubles or real numberschrstands for character vectors, where strings are included as welldttmis the abbreviation fordata-timesdatestands for dateslglstands for logical or vectors that containTRUEorFALSEfctrstands for factors, which R uses to represent categorical variables with fixed possible values
dplyr Functions
# Select all flights on Jan 1st.
filter(flights, month == 1, day == 1)
# Select all flights in Nov or Dec.
filter(flights, month == 11 | month == 12) # Pay attention to the `|` here, it is not `||`
filter(flights, month %in% c(9, 10)) # Select all flights in Sep or Oct.
filter(flights, between(dep_time, 0, 600)) # Departed between midnight and 6am
cat("[ sqrt(2) ^ 2 == 2 ] ->", sqrt(2) ^ 2 == 2)
## [ sqrt(2) ^ 2 == 2 ] -> FALSE
cat("\n[ near(sqrt(2) ^ 2, 2) ] ->", near(sqrt(2) ^ 2, 2))
##
## [ near(sqrt(2) ^ 2, 2) ] -> TRUE
cat("\n[ 1/49 * 49 == 1 ] ->", 1/49 * 49 == 1)
##
## [ 1/49 * 49 == 1 ] -> FALSE
cat("\n[ near(1/49 * 49, 1) ] ->", near(1/49 * 49, 1))
##
## [ near(1/49 * 49, 1) ] -> TRUE
# Pay attention to the `|` here, which isn't `||`
all(
filter(flights, !(arr_delay > 120 | dep_delay > 120)) ==
filter(flights, arr_delay <= 120, dep_delay <= 120)
)
## [1] TRUE
NA
## [1] NA
NA > 33
## [1] NA
NA - 2
## [1] NA
NA ^ NA
## [1] NA
is.na(NA) # A recommended way to determine whether a value is missing
## [1] TRUE
NA will make almost any operation unknown.
select(flights, year:day) # Same with the below
# select(flights, year, month, day)
select(flights, -(year: day)) # Select all colums except year, month and day
select(flights, starts_with("dep_"))
select(flights, contains("dep_"))
select(flights, contains("dep_"), everything()) # All dep fields moved to the start of the dataframe
select(flights, 出発時間 = dep_time, 予定出発 = sched_dep_time, 出発遅延 = dep_delay)
# Slight Difference between `select` and `rename`
rename(flights, 出発時間 = dep_time, 予定出発 = sched_dep_time, 出発遅延 = dep_delay)
mutate(
select(flights, year:day, ends_with("delay"), distance, air_time),
gain = dep_delay - arr_delay,
speed = distance / air_time * 60,
gain_per_hour = gain / air_time * 60 # Refer to the newly created column
)
transmute(
flights,
gain = dep_delay - arr_delay,
speed = distance / air_time * 60,
gain_per_hour = gain / air_time * 60 # Refer to the newly created column
) # which will drop all original variables in the table
summarise(flights, delay = mean(dep_delay, na.rm = TRUE), count = n())
summarise(
group_by(flights, year, month, day),
delay = mean(dep_delay, na.rm = TRUE),
count = n()
)
# (delay_dataset <- summarise(
# group_by(flights, dest),
# count = n(),
# dist = mean(distance, na.rm = TRUE),
# delay = mean(arr_delay, na.rm = TRUE)
# ))
# (delay_filtered <- filter(delay_dataset, count > 20, dest != "HNL"))
#
# Use Pipe to make the transformation above concise:
(delays <- flights %>%
group_by(dest) %>%
summarise(
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
filter(count > 20, dest != "HNL")
)
ggplot(delays, aes(x = dist, y = delay)) +
geom_point(aes(size = count), alpha = 1/3) +
geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Visualize the density through count
(delays <- flights
%>% filter(!is.na(dep_delay),!is.na(arr_delay)) # Filter the flights not cancelled
%>% group_by(tailnum)
%>% summarise(
delay = mean(arr_delay),
count = n()
)
)
filter(delays, count > 25)
delays %>%
ggplot(mapping = aes(x = delay)) +
geom_freqpoly(binwidth = 10) +
xlab("delay [min]")
delays %>%
# filter(count > 25) %>% # Filter to few flights to see variation
ggplot(mapping = aes(x = delay, y = count)) +
geom_point(alpha = 1/8) + # Use the stastistic through count
xlab("delay [min]")
# Estimation Upon Average Delay among NYC-Flights
(
flights
%>% filter(!is.na(dep_delay),!is.na(arr_delay)) # Filter the flights not cancelled
%>% group_by(year, month, day)
%>% summarise(
avg_delay_for_all = mean(arr_delay),
avg_delay_for_positive_part = mean(arr_delay[arr_delay > 0])
)
%>% mutate(
date_field = ISOdate(year, month, day) # Attach date type variable
)
%>% ggplot()
+ geom_point(
mapping = aes(x = date_field, y = avg_delay_for_all, colour = "DelayForAll")
)
+ geom_point(
mapping = aes(x = date_field, y = avg_delay_for_positive_part, colour = "DelayForPositive")
)
+ scale_color_manual(
# Set the order in the legend
breaks = c("DelayForPositive", "DelayForAll"),
values = c("DelayForAll" = "brown", "DelayForPositive" = "violetred1")
)
)
(
flights
%>% filter(!is.na(dep_delay),!is.na(arr_delay)) # Filter the flights not cancelled
%>% group_by(dest)
%>% summarise(
distanceAvg = mean(distance),
min = min(distance),
max = max(distance),
median = median(distance),
standardDeviation = sd(distance),
interquartileRange = IQR(distance),
interquartileRange_0 = quantile(distance, 0.75) - quantile(distance, 0.25),
medianAbsoluteDeviation = mad(distance),
prove_quantile1 = mean(distance < quantile(distance, 0.25)) # Get a Proportion
)
%>% arrange(desc(standardDeviation))
)
Measures of spread:
- Mean Squared Deviation: \(\frac{\sum (x-\mu)^2}{N}\)
- \(N\) = Number of Data
- Standard Deviation(Root Mean Squared Deviation): \(\sigma = \sqrt{\frac{\sum (x-\mu)^2}{N}}\)
- \(N\) = Number of Data
Median Absolute Deviation:
\(MAD(D) = median({|d_i - median(D)|})\)
(
flights
%>% filter(!is.na(dep_delay),!is.na(arr_delay)) # Filter the flights not cancelled
%>% group_by(dest)
%>% summarise(
count = n(),
carriers = n_distinct(carrier), # Same as the below
# carriers_0 = length(unique(carrier)),
carriers_list = paste(unique(carrier), collapse = ", ")
)
%>% arrange(desc(carriers))
)
Summary Fist and Last Flights each day
Departure time only of first and last flights each day
(
flights
%>% filter(!is.na(dep_delay),!is.na(arr_delay)) # Filter the flights not cancelled
%>% group_by(year, month, day)
%>% summarise(
first_dep_a = min(dep_time),
first_dep_b = first(dep_time),
last_dep_a = max(dep_time),
last_dep_b = last(dep_time),
n_early = sum(dep_time < 500) # Count flights left before 5am
)
)
Details of first and last flights each day
(
flights
%>% filter(!is.na(dep_delay),!is.na(arr_delay)) # Filter the flights not cancelled
%>% group_by(year, month, day)
%>% mutate(
ranking = min_rank(desc(dep_time)),
ranking_reverse = min_rank(dep_time),
range_r = paste(range(ranking), collapse = " ~ ")
)
%>% filter(ranking %in% range(ranking))
%>% arrange(year, month, day, ranking)
)
Summary Mileage Each Day
count(flights, year, month, day, wt = distance)
count(flights, year, month, day, tailnum, wt = distance)
Popular Destinations
# Filter By Group Without Summary
flights %>%
group_by(dest) %>%
filter(n() > 365) %>%
filter(arr_delay > 0) %>%
mutate(prop_delay = arr_delay / sum(arr_delay)) %>%
select(year: day, dest, arr_delay, prop_delay) %>%
arrange(dest)
Rolling up Summary
(daily <- group_by(flights, year, month, day))
(per_day <- summarise(daily, count = n()))
(per_month <- summarise(per_day, count = sum(count)))
(per_year <- summarise(per_month, count = sum(count)))
daily %>%
ungroup() %>%
summarise(flights = n())